/*
*  This file is part of ygg-brute
*  Copyright (c) 2020 ygg-brute authors
*  See LICENSE for licensing information
*/

#pragma once

#include <cstdint>
#include "cuda/common.cuh"

namespace cuda {

__device__ __forceinline__ uint32_t clz(uint32_t x)
{
    return __clz(x);
}

__device__ __forceinline__ void node_id_to_ipv6(const uint32_t w[8], uint8_t ipv6[16])
{
    uint32_t cnt = 0;
    uint32_t i = 0;

    for(; i < 4; ++i) {
        if(w[i] == 0xffffffff) {
            cnt += 32;
        } else {
            cnt += clz(~w[i]);
            break;
        }
    }

    i = (cnt + 1) / 32;
    uint32_t shift = (cnt + 1) % 32;

    uint32_t a0, a1, a2, a3;

    asm(R"({
        .reg.u32 t0, t1, t2, t3;
        shf.l.clamp.b32   t0, %5, %4, %9;
        shf.l.clamp.b32   t1, %6, %5, %9;
        shf.l.clamp.b32   t2, %7, %6, %9;
        shf.l.clamp.b32   t3, %8, %7, %9;
        shf.r.clamp.b32   %3, t3, t2, 16;
        shf.r.clamp.b32   %2, t2, t1, 16;
        shf.r.clamp.b32   %1, t1, t0, 16;
        shr.b32           %0, t0, 16;
    })"
    // %0 - %3
    : "=r"(a0), "=r"(a1), "=r"(a2), "=r"(a3)
    // %4 - %8
    : "r"(w[i]), "r"(w[i + 1]), "r"(w[i + 2]), "r"(w[i + 3]), "r"(w[i + 4]), "r"(shift)
    );

    uint32_t* ip = (uint32_t*)ipv6;
    ip[0] = bswap_u32(a0 | (cnt << 16) | 0x02000000);
    ip[1] = bswap_u32(a1);
    ip[2] = bswap_u32(a2);
    ip[3] = bswap_u32(a3);
}

}